## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 625222 33.4 1434830 76.7 702048 37.5
## Vcells 1171653 9.0 8388608 64.0 1927980 14.8
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)
`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')note: some duplicated ids in PSS
fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]
fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
# Remove NA and empty strings
x = x[!is.na(x) & x != ""]
paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]
fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]
pss_long = melt(
pss,
id.vars = c("name", "all_pathways", 'short_name'), # Columns to keep as is
measure.vars = patterns("^id_"), # Columns to melt (all starting with "id_")
variable.name = "id_num", # Name for the melted variable column
value.name = "id" # Name for the melted value column
)
pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))##
## FALSE TRUE
## 816 24
pss_long %>%
dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
dplyr::arrange(id) %>%
print()## all_pathways short_name id
## <char> <char> <char>
## 1: Hormone - Ethylene (ET) ORA59 AT1G06160
## 2: Hormone - Ethylene (ET) ERF/ORA59 AT1G06160
## 3: Hormone - Salicylic acid (SA) TCP8 AT1G58100
## 4: Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
## 5: Hormone - Ethylene (ET) EDF2 AT1G68840
## 6: Hormone - Ethylene (ET) ERF/EDF AT1G68840
## 7: Signalling - Heat-shock proteins (HSPs),Stress - Heat HSP70 AT3G12580
## 8: Signalling - Heat-shock proteins (HSPs) HSP AT3G12580
## 9: Hormone - Ethylene (ET) ERF1 AT3G23240
## 10: Hormone - Ethylene (ET) ERF/EDF AT3G23240
## 11: Hormone - Ethylene (ET) ERF6 AT4G17490
## 12: Hormone - Ethylene (ET) ERF/ORA59 AT4G17490
## 13: Hormone - Ethylene (ET) ERF1 AT4G17500
## 14: Hormone - Ethylene (ET) ERF/EDF AT4G17500
## 15: Signalling - Heat-shock proteins (HSPs) MED37E AT5G02500
## 16: Signalling - Heat-shock proteins (HSPs) HSP AT5G02500
## 17: Hormone - Ethylene (ET) ERF096 AT5G43410
## 18: Hormone - Ethylene (ET) ERF/EDF AT5G43410
## 19: Hormone - Ethylene (ET) ERF5 AT5G47230
## 20: Hormone - Ethylene (ET) ERF/ORA59 AT5G47230
## 21: Hormone - Ethylene (ET) ERF105 AT5G51190
## 22: Hormone - Ethylene (ET) ERF/ORA59 AT5G51190
## 23: Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24: Signalling - Heat-shock proteins (HSPs) HSP AT5G59720
## 25: Hormone - Ethylene (ET) ERF104 AT5G61600
## 26: Hormone - Ethylene (ET) ERF/EDF AT5G61600
## all_pathways short_name id
pss_long = pss_long %>%
dplyr::filter(stringr::str_starts(id, "AT")) %>%
dplyr::group_by(id) %>%
dplyr::summarise(
dplyr::across(
.cols = dplyr::everything(),
.fns = ~ {
vals = unique(na.omit(.))
if (length(vals) > 1) paste(vals, collapse = " | ")
else if (length(vals) == 1) vals
else NA_character_
}
),
.groups = "drop"
)Note: be careful with 35.2 bin matches
| Plant Name | Label | JCVI-MCScan | Compara Plants | Plaza | OrthoDB | FastOma | RBH | Mercator |
|---|---|---|---|---|---|---|---|---|
| Malus domestica | apple | mdo_GDv1 | malus_domestica_golden | mdo | mdo | mdo | mdo | mdo |
| mdo_HChap1 | ||||||||
| Prunus persica | ppe | ppe | prunus_persica | ppe | ppe | pper | ppe | pper |
| Prunus dulcis / P. amygdalus | almond | almond | prunus_dulcis | pdul | pdul | pdul | pdul | |
| Prunus avium | wild cherry | wildcherry | prunus_avium | pavi | pavi | pavi | pavi | |
| Prunus armeniaca | apricot | apricot | parm | parm | parm | parm | ||
| Prunus cerasifera | cherry plum | cherryplum | pcer | pcer | pcer | |||
| Pyrus | pear | pear | pcox | pcox | pcox | |||
| Prunus sibirica | Siberian apricot | siberianapricot | psib | psib | psib |
params_list <- list(
plantName = 'mdo'
, plantNameOut = "apple"
, plantDirOut = file.path('..', 'reports', group, "apple")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001614683e848>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-11] | |…… | 11% | |…….. | 15% [unnamed-chunk-12] | |……… | 19% | |……….. | 22% [unnamed-chunk-13] | |…………. | 26% | |…………… | 30% [unnamed-chunk-14] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-15] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-16] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-17] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-18] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-19] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-20] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-21] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-22] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-23] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 20997 77128 32036 56069 30068 37682
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT4G10330 ath Maldo.hc.v1a1.ch10A.g00003 mdo FastOMA
## 2 AT4G10340 ath Maldo.hc.v1a1.ch10A.g00004 mdo FastOMA
## 3 AT5G44410 ath g1 mdo FastOMA
## 4 AT5G44440 ath g1 mdo FastOMA
## 5 AT1G03770 ath Maldo.hc.v1a1.ch10A.g02825 mdo MCScanX
## 6 AT1G03800 ath Maldo.hc.v1a1.ch10A.g02815 mdo MCScanX
## 7 ATMG00910 ath Maldo.hc.v1a1.sc45A.g49878 mdo MCScanX
## 8 ATMG01020 ath Maldo.hc.v1a1.sc45A.g49883 mdo MCScanX
## 9 AT2G46320 ath Maldo.hc.v1a1.ch1A.g26266 mdo OrthoDB
## 10 AT4G27940 ath Maldo.hc.v1a1.ch1A.g26266 mdo OrthoDB
## 11 AT2G07695 ath Maldo.hc.v1a1.sc164A.g48922 mdo OrthoDB
## 12 AT2G07695 ath Maldo.hc.v1a1.sc119A.g48697 mdo OrthoDB
## 13 AT2G28190 ath Maldo.hc.v1a1.ch6A.g38979 mdo PLAZA
## 14 AT2G07732 ath Maldo.hc.v1a1.sc90A.g49976 mdo PLAZA
## 15 AT5G17400 ath Maldo.hc.v1a1.ch9A.g48118 mdo PLAZA
## 16 AT2G47300 ath Maldo.hc.v1a1.ch17A.g24110 mdo PLAZA
## 17 AT1G01020 ath Maldo.hc.v1a1.ch7A.g41990 mdo RBH
## 18 AT1G01030 ath Maldo.hc.v1a1.ch1A.g25187 mdo RBH
## 19 ATMG01410 ath Maldo.hc.v1a1.sc36A.g49760 mdo RBH
## 20 ATMG01410 ath Maldo.hc.v1a1.sc71A.g49908 mdo RBH
## 21 AT4G39400 ath Maldo.hc.v1a1.ch2A.g27501 mdo compara
## 22 AT4G39400 ath Maldo.hc.v1a1.ch15A.g17036 mdo compara
## 23 AT1G65880 ath Maldo.hc.v1a1.ch17A.g24066 mdo compara
## 24 AT5G52820 ath Maldo.hc.v1a1.ch17A.g24067 mdo compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2241383 119.8 4354065 232.6 3000861 160.3
## Vcells 19974681 152.4 32427968 247.5 32406934 247.3
## [1] 23046
## [1] 34410
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 20997 77128 32036 56069 30068 37682
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ComplexUpset package.
## Please report the issue at
## <https://github.com/krassowski/complex-upset/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 117481 27
## [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 8145738 435.1 14894365 795.5 14612893 780.5
## Vcells 127017683 969.1 200926043 1533.0 200893451 1532.7
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 23046
## [1] 34410
## [1] 1 128
## [1] 1 116
## [1] 319
## [1] 288
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 95265 13328 8915
##
## 100% match no match partial match
## 1 47630 10916 7960
## 2 13082 1663 576
## 3 8862 473 178
## 4 8957 150 104
## 5 10128 96 69
## 6 6606 30 28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 5384 1939 4274
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 5278 4274 1240 1070
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 32036 579 5384 667
## PLAZA RBH_MapMan4 reject
## 6568 1939 58473
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 53202 3640 2193
##
## 100% match no match partial match
## 1 14805 1615 1530
## 2 7302 1354 341
## 3 6601 402 135
## 4 8013 144 95
## 5 9875 95 64
## 6 6606 30 28
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 21143
## [1] 33288
## [1] 1 93
## [1] 1 102
## [1] 30
## [1] 27
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 163 94 71 35
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 1336 26 122 30
## PLAZA RBH_MapMan4 reject
## 264 36 1768
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'ppe'
, plantNameOut = "peach"
, plantDirOut = file.path('..', 'reports', group, "peach")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001618c41fae0>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-39] | |…… | 11% | |…….. | 15% [unnamed-chunk-40] | |……… | 19% | |……….. | 22% [unnamed-chunk-41] | |…………. | 26% | |…………… | 30% [unnamed-chunk-42] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-43] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-44] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-45] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-46] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-47] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-48] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-49] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-50] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-51] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 16193 44006 17894 38370 20791 24564
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath Prupe.1G000500 ppe FastOMA
## 2 AT1G62440 ath Prupe.1G000500 ppe FastOMA
## 3 AT1G61010 ath Prupe.I006100 ppe FastOMA
## 4 AT1G61010 ath Prupe.I006200 ppe FastOMA
## 5 AT1G04230 ath Prupe.1G027200 ppe MCScanX
## 6 AT1G04240 ath Prupe.1G027500 ppe MCScanX
## 7 ATCG00530 ath Prupe.7G029500 ppe MCScanX
## 8 ATCG00680 ath Prupe.7G029800 ppe MCScanX
## 9 AT3G17900 ath Prupe.1G267800 ppe OrthoDB
## 10 AT4G35230 ath Prupe.1G355500 ppe OrthoDB
## 11 AT2G07675 ath Prupe.6G146800 ppe OrthoDB
## 12 ATMG00980 ath Prupe.6G146800 ppe OrthoDB
## 13 AT3G02890 ath Prupe.2G329000 ppe PLAZA
## 14 AT5G16680 ath Prupe.2G329000 ppe PLAZA
## 15 AT4G03170 ath Prupe.4G260600 ppe PLAZA
## 16 AT4G03160 ath Prupe.4G260600 ppe PLAZA
## 17 AT1G01030 ath Prupe.5G134900 ppe RBH
## 18 AT1G01040 ath Prupe.2G200900 ppe RBH
## 19 ATMG01250 ath Prupe.6G123900 ppe RBH
## 20 ATMG01250 ath Prupe.7G164000 ppe RBH
## 21 AT5G01010 ath Prupe.6G300700 ppe compara
## 22 AT5G01020 ath Prupe.6G300300 ppe compara
## 23 AT1G80940 ath Prupe.3G103600 ppe compara
## 24 AT1G80950 ath Prupe.1G382400 ppe compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6160929 329.1 11915492 636.4 14894365 795.5
## Vcells 135778538 1036.0 232148417 1771.2 200925340 1533.0
## [1] 23113
## [1] 21245
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 16193 44006 17894 38370 20791 24564
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "athName" "athSynonims"
## [17] "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 71222 231
## [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
## [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
## [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
## [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
## [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
## [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6312945 337.2 11915492 636.4 14894365 795.5
## Vcells 91750182 700.0 232148417 1771.2 232142054 1771.2
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 23113
## [1] 21245
## [1] 1 57
## [1] 1 115
## [1] 264
## [1] 135
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 60359 6360 4734
##
## 100% match no match partial match
## 1 29055 5410 4119
## 2 8732 560 368
## 3 5361 185 103
## 4 5521 103 68
## 5 6890 68 51
## 6 4800 34 25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 4153 1191 2083
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 5084 2083 1109 232
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 17894 415 4153 516
## PLAZA RBH_MapMan4 reject
## 4926 1191 33850
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 34638 1556 1409
##
## 100% match no match partial match
## 1 9947 853 970
## 2 4843 366 219
## 3 3685 143 81
## 4 4775 92 63
## 5 6588 68 51
## 6 4800 34 25
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20622
## [1] 20886
## [1] 1 50
## [1] 1 102
## [1] 7
## [1] 15
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 145 38 43 7
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 728 27 53 16
## PLAZA RBH_MapMan4 reject
## 166 21 1150
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pdul'
, plantNameOut = "almond"
, plantDirOut = file.path('..', 'reports', group, "almond")
, flag = 2
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000161b861e7a0>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-67] | |…… | 11% | |…….. | 15% [unnamed-chunk-68] | |……… | 19% | |……….. | 22% [unnamed-chunk-69] | |…………. | 26% | |…………… | 30% [unnamed-chunk-70] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-71] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-72] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-73] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-74] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-75] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-76] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-77] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-78] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-79] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB RBH
## 15975 42927 20148 35734 24829
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G05620 ath TexasF1_G1000 pdul FastOMA
## 2 AT3G48880 ath TexasF1_G10001 pdul FastOMA
## 3 AT3G48890 ath TexasF1_G9999 pdul FastOMA
## 4 AT5G52240 ath TexasF1_G9999 pdul FastOMA
## 5 AT1G04220 ath TexasF1_G767 pdul MCScanX
## 6 AT1G04230 ath TexasF1_G779 pdul MCScanX
## 7 ATCG00170 ath TexasF1_G22620 pdul MCScanX
## 8 ATCG00500 ath TexasF1_G22624 pdul MCScanX
## 9 AT1G23390 ath TexasF1_G3359 pdul OrthoDB
## 10 AT5G19210 ath TexasF1_G2060 pdul OrthoDB
## 11 AT3G51810 ath TexasF1_G23162 pdul OrthoDB
## 12 AT2G28815 ath TexasF1_G6420 pdul OrthoDB
## 13 AT1G01030 ath TexasF1_G18833 pdul RBH
## 14 AT1G01040 ath TexasF1_G9057 pdul RBH
## 15 ATMG00860 ath TexasF1_G25095 pdul RBH
## 16 ATMG01250 ath TexasF1_G22012 pdul RBH
## 17 AT4G37540 ath TexasF1_G22582 pdul compara
## 18 AT5G67420 ath TexasF1_G22582 pdul compara
## 19 AT4G15960 ath TexasF1_G27715 pdul compara
## 20 AT3G45140 ath TexasF1_G6796 pdul compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5092078 272.0 11915492 636.4 14894365 795.5
## Vcells 96743945 738.1 232148417 1771.2 232142054 1771.2
## [1] 22451
## [1] 20903
##
## compara FastOMA MCScanX OrthoDB RBH
## 15975 42927 20148 35734 24829
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "athName" "athSynonims" "all_pathways"
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE
## 67030
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4648164 248.3 11915492 636.4 14894365 795.5
## Vcells 58956976 449.9 185724412 1417.0 232155515 1771.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22451
## [1] 20903
## [1] 1 150
## [1] 1 113
## [1] 171
## [1] 120
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 54341 7625 5064
##
## 100% match no match partial match
## 1 26684 6154 4134
## 2 8165 848 468
## 3 5584 289 176
## 4 6775 187 146
## 5 7133 147 140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 4025 1664 2469
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 4234 2469 1038 583
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 20148 722 4025 692
## RBH_MapMan4 reject
## 1664 31455
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 31939 1865 1771
##
## 100% match no match partial match
## 1 9222 690 1054
## 2 4772 600 297
## 3 4402 249 144
## 4 6410 179 136
## 5 7133 147 140
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20096
## [1] 20187
## [1] 1 51
## [1] 1 102
## [1] 5
## [1] 11
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 118 52 53 25
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 843 42 69 19
## RBH_MapMan4 reject
## 40 1144
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pavi'
, plantNameOut = "wildcherry"
, plantDirOut = file.path('..', 'reports', group, "wildcherry")
, flag = 2
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001614e2ac588>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-95] | |…… | 11% | |……. | 15% [unnamed-chunk-96] | |……… | 19% | |……….. | 22% [unnamed-chunk-97] | |…………. | 26% | |…………… | 30% [unnamed-chunk-98] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-99] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-100] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-101] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-102] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-103] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-104] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-105] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-106] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-107] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## compara FastOMA MCScanX OrthoDB RBH
## 4367 45924 19708 38228 25594
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath FUN_000050 pavi FastOMA
## 2 AT1G62440 ath FUN_000050 pavi FastOMA
## 3 AT2G43840 ath FUN_040335 pavi FastOMA
## 4 AT2G44050 ath FUN_040336 pavi FastOMA
## 5 AT1G04220 ath FUN_000300 pavi MCScanX
## 6 AT1G04230 ath FUN_000316 pavi MCScanX
## 7 ATMG01190 ath FUN_026738 pavi MCScanX
## 8 ATMG00910 ath FUN_040205 pavi MCScanX
## 9 AT4G39370 ath FUN_020728 pavi OrthoDB
## 10 AT3G06350 ath FUN_020749 pavi OrthoDB
## 11 AT4G24220 ath FUN_029917 pavi OrthoDB
## 12 AT4G24220 ath FUN_029968 pavi OrthoDB
## 13 AT1G01030 ath FUN_025493 pavi RBH
## 14 AT1G01040 ath FUN_011748 pavi RBH
## 15 ATMG01250 ath FUN_040221 pavi RBH
## 16 ATMG01360 ath FUN_026804 pavi RBH
## 17 AT2G22690 ath FUN_021390 pavi compara
## 18 AT4G37880 ath FUN_021390 pavi compara
## 19 AT4G39770 ath FUN_021012 pavi compara
## 20 AT4G21740 ath FUN_032538 pavi compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4096225 218.8 11915492 636.4 14894365 795.5
## Vcells 62787153 479.1 185724412 1417.0 232155515 1771.3
## [1] 22167
## [1] 21950
##
## compara FastOMA MCScanX OrthoDB RBH
## 4367 45924 19708 38228 25594
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "athName" "athSynonims" "all_pathways"
## [17] "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 71643 2
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4707333 251.4 11915492 636.4 14894365 795.5
## Vcells 60032963 458.1 185729008 1417.0 232155515 1771.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22167
## [1] 21950
## [1] 1 122
## [1] 1 115
## [1] 242
## [1] 131
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 56863 8890 5892
##
## 100% match no match partial match
## 1 28791 7489 4881
## 2 9611 946 572
## 3 8287 298 243
## 4 8423 131 163
## 5 1751 26 33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 5359 1923 3416
## #### #### #### ####
##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 1309 3416 2133 891
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 19708 3052 5359 1307
## RBH_MapMan4 reject
## 1923 32547
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 34836 1933 2329
##
## 100% match no match partial match
## 1 11001 732 1533
## 2 6509 770 385
## 3 7254 276 215
## 4 8321 129 163
## 5 1751 26 33
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 19897
## [1] 20960
## [1] 1 59
## [1] 1 102
## [1] 11
## [1] 15
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 47 116 79 36
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 789 84 116 61
## RBH_MapMan4 reject
## 36 1385
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'parm'
, plantNameOut = "apricot"
, plantDirOut = file.path('..', 'reports', group, "apricot")
, flag = 3
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000001618d84bd68>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-123] | |…… | 11% | |……. | 15% [unnamed-chunk-124] | |……… | 19% | |……….. | 22% [unnamed-chunk-125] | |…………. | 26% | |…………… | 30% [unnamed-chunk-126] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-127] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-128] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-129] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-130] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-131] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-132] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-133] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-134] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-135] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## FastOMA MCScanX OrthoDB RBH
## 43038 18615 52084 25259
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 16 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath PruarM.1G000500 parm FastOMA
## 2 AT1G62440 ath PruarM.1G000500 parm FastOMA
## 3 AT2G47410 ath PruarM.8G368500 parm FastOMA
## 4 AT4G02060 ath PruarM.8G368700 parm FastOMA
## 5 AT1G04400 ath PruarM.1G051900 parm MCScanX
## 6 AT1G04410 ath PruarM.1G052500 parm MCScanX
## 7 AT5G64410 ath PruarM.8G146300 parm MCScanX
## 8 AT5G64410 ath PruarM.8G146200 parm MCScanX
## 9 AT5G10270 ath PruarM.1G279700 parm OrthoDB
## 10 AT5G64960 ath PruarM.1G279700 parm OrthoDB
## 11 AT2G15790 ath PruarM.8G195500 parm OrthoDB
## 12 AT4G34660 ath PruarM.8G163400 parm OrthoDB
## 13 AT1G01010 ath PruarM.2G368400 parm RBH
## 14 AT1G01030 ath PruarM.5G193300 parm RBH
## 15 ATMG01360 ath PruarM.4G189900 parm RBH
## 16 ATMG01360 ath PruarM.4G190100 parm RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4146652 221.5 11915492 636.4 14894365 795.5
## Vcells 64254829 490.3 185767249 1417.3 232155515 1771.3
## [1] 22351
## [1] 22551
##
## FastOMA MCScanX OrthoDB RBH
## 43038 18615 52084 25259
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "from_count" "to_count"
## [9] "count_evidence" "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION"
## [13] "athName" "athSynonims" "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE
## 84042
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4048404 216.3 11915492 636.4 14894365 795.5
## Vcells 47943365 365.8 148613800 1133.9 232155515 1771.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22351
## [1] 22551
## [1] 1 267
## [1] 1 113
## [1] 344
## [1] 392
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 67784 10212 6046
##
## 100% match no match partial match
## 1 41417 8961 4950
## 2 9532 896 600
## 3 8599 227 306
## 4 8236 128 190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## OrthoDB_MapMan4 RBH_MapMan4 FastOMA_MapMan4
## 18336 2176 2959
## #### #### #### ####
##
## FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH MCScanX
## 2959 2357 842 18615
## OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH RBH_MapMan4
## 3716 18336 1371 2176
## reject
## 33670
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 46348 1673 2351
##
## 100% match no match partial match
## 1 23953 606 1463
## 2 6492 726 426
## 3 7667 213 272
## 4 8236 128 190
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 19826
## [1] 20698
## [1] 1 261
## [1] 1 102
## [1] 78
## [1] 275
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH MCScanX
## 125 110 23 768
## OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH RBH_MapMan4
## 123 139 56 34
## reject
## 1497
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pcox'
, plantNameOut = "pear"
, plantDirOut = file.path('..', 'reports', group, "pear")
, flag = 4
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000161d4175ca8>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-151] | |…… | 11% | |……. | 15% [unnamed-chunk-152] | |……… | 19% | |……….. | 22% [unnamed-chunk-153] | |…………. | 26% | |…………… | 30% [unnamed-chunk-154] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-155] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-156] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-157] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-158] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-159] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-160] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-161] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-162] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-163] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## FastOMA MCScanX RBH
## 75969 33983 36090
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G20520 ath Pyrco.da.v2a1.augustus.000230 pcox FastOMA
## 2 AT1G76210 ath Pyrco.da.v2a1.augustus.000230 pcox FastOMA
## 3 AT5G53090 ath Pyrco.da.v2a1.snap.445350 pcox FastOMA
## 4 AT5G53100 ath Pyrco.da.v2a1.snap.445350 pcox FastOMA
## 5 AT1G03900 ath Pyrco.da.v2a1.chr10A.089110 pcox MCScanX
## 6 AT1G03920 ath Pyrco.da.v2a1.chr10A.089090 pcox MCScanX
## 7 AT5G56870 ath Pyrco.da.v2a1.chr9A.225970 pcox MCScanX
## 8 AT5G57035 ath Pyrco.da.v2a1.chr9A.225390 pcox MCScanX
## 9 AT1G01030 ath Pyrco.da.v2a1.chr14A.371380 pcox RBH
## 10 AT1G01030 ath Pyrco.da.v2a1.chr1A.345960 pcox RBH
## 11 ATMG01250 ath Pyrco.da.v2a1.chr5A.045340 pcox RBH
## 12 ATMG01250 ath Pyrco.da.v2a1.snap.153710 pcox RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3761949 201.0 11915492 636.4 14894365 795.5
## Vcells 52119692 397.7 148652151 1134.2 232155515 1771.3
## [1] 21603
## [1] 30838
##
## FastOMA MCScanX RBH
## 75969 33983 36090
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "athName"
## [13] "athSynonims" "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE
## 95885
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3685817 196.9 11915492 636.4 14894365 795.5
## Vcells 41892282 319.7 118921721 907.4 232155515 1771.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 21603
## [1] 30838
## [1] 1 136
## [1] 1 116
## [1] 261
## [1] 287
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 76754 10595 8536
##
## 100% match no match partial match
## 1 44198 9410 7873
## 2 17208 975 468
## 3 15348 210 195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## RBH_MapMan4 FastOMA_MapMan4
## 4486 8656
## #### #### #### ####
##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 8656 5612 33983 4486 43148
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 46863 3471 2403
##
## 100% match no match partial match
## 1 16410 2355 1804
## 2 15105 906 404
## 3 15348 210 195
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 19972
## [1] 30046
## [1] 1 44
## [1] 1 102
## [1] 12
## [1] 37
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 332 175 1459 134 1204
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pcer'
, plantNameOut = "cherryplum"
, plantDirOut = file.path('..', 'reports', group, "cherryplum")
, flag = 4
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000016218a7ba58>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-179] | |…… | 11% | |……. | 15% [unnamed-chunk-180] | |……… | 19% | |……….. | 22% [unnamed-chunk-181] | |…………. | 26% | |…………… | 30% [unnamed-chunk-182] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-183] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-184] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-185] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-186] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-187] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-188] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-189] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-190] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-191] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## FastOMA MCScanX RBH
## 162100 63358 80487
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G32760 ath Pcer_000001-RA pcer FastOMA
## 2 AT1G07920 ath Pcer_000002-RA pcer FastOMA
## 3 AT1G16650 ath Pcer_097557-RA pcer FastOMA
## 4 AT1G53530 ath Pcer_097558-RA pcer FastOMA
## 5 AT1G04220 ath Pcer_000137-RA pcer MCScanX
## 6 AT1G04230 ath Pcer_000150-RA pcer MCScanX
## 7 ATMG00080 ath Pcer_091446-RA pcer MCScanX
## 8 ATMG00513 ath Pcer_091469-RA pcer MCScanX
## 9 AT1G01030 ath Pcer_027461-RA pcer RBH
## 10 AT1G01030 ath Pcer_038773-RA pcer RBH
## 11 ATMG01330 ath Pcer_091451-RA pcer RBH
## 12 ATMG01360 ath Pcer_096779-RA pcer RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3510935 187.6 11915492 636.4 14894365 795.5
## Vcells 44978395 343.2 118921721 907.4 232155515 1771.3
## [1] 22197
## [1] 71437
##
## FastOMA MCScanX RBH
## 162100 63358 80487
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "athName"
## [13] "athSynonims" "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE TRUE
## 201293 3
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4201617 224.4 11915492 636.4 14894365 795.5
## Vcells 57476412 438.6 118921721 907.4 232155515 1771.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 22197
## [1] 71437
## [1] 1 270
## [1] 1 113
## [1] 890
## [1] 449
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 166873 19225 15198
##
## 100% match no match partial match
## 1 97070 16707 13585
## 2 39984 1995 1240
## 3 29819 523 373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## RBH_MapMan4 FastOMA_MapMan4
## 12103 25962
## #### #### #### ####
##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 25962 18052 63358 12103 81821
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 108487 4446 6542
##
## 100% match no match partial match
## 1 42973 2064 5067
## 2 35695 1859 1102
## 3 29819 523 373
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 20941
## [1] 69011
## [1] 1 258
## [1] 1 102
## [1] 282
## [1] 78
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 956 583 2541 351 3036
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'psib'
, plantNameOut = "siberianapricot"
, plantDirOut = file.path('..', 'reports', group, "siberianapricot")
, flag = 4
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000161cd798320>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-207] | |…… | 11% | |……. | 15% [unnamed-chunk-208] | |……… | 19% | |……….. | 22% [unnamed-chunk-209] | |…………. | 26% | |…………… | 30% [unnamed-chunk-210] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-211] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-212] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-213] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-214] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-215] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-216] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-217] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-218] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-219] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
##
## FastOMA MCScanX RBH
## 40732 16398 25288
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath PaF106G0100000005 psib FastOMA
## 2 AT1G62440 ath PaF106G0100000005 psib FastOMA
## 3 AT3G07140 ath PaF106G0800032954 psib FastOMA
## 4 AT3G07140 ath PaF106G0800032956 psib FastOMA
## 5 AT1G04230 ath PaF106G0100000358 psib MCScanX
## 6 AT1G04240 ath PaF106G0100000361 psib MCScanX
## 7 ATCG00340 ath PaF106G0700028636 psib MCScanX
## 8 ATCG00350 ath PaF106G0700028635 psib MCScanX
## 9 AT1G01030 ath PaF106G0500020091 psib RBH
## 10 AT1G01040 ath PaF106G0200009357 psib RBH
## 11 ATMG01190 ath PaF106G0600023358 psib RBH
## 12 ATMG01250 ath PaF106G0700028671 psib RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3867738 206.6 9532394 509.1 14894365 795.5
## Vcells 62767524 478.9 118926637 907.4 232155515 1771.3
## [1] 21427
## [1] 19982
##
## FastOMA MCScanX RBH
## 40732 16398 25288
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, gn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, sn, by.x = 'from_geneID', by.y = 'V1', all.x = TRUE) #
df = merge(df, pss_long, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = pss_long[which(!(pss_long$id %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
nin = merge(nin, gn, by.x = 'id', by.y = 'V1', all.x = TRUE)
nin = merge(nin, sn, by.x = 'id', by.y = 'V1', all.x = TRUE)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('fruitTrees', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "athName"
## [13] "athSynonims" "all_pathways" "short_name"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$fruitTrees_BINCODE))##
## FALSE
## 53638
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "gn", "sn", "pss_long",
"plantName",
"plantNameOut",
"plantDirOut",
"pattern_in",
"pattern_out",
"mercator",
"mercatorPatternIn1",
"mercatorPatternOut1",
"mercatorPatternIn2",
"mercatorPatternOut2",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3482212 186.0 9532394 509.1 14894365 795.5
## Vcells 35820380 273.3 118926637 907.4 232155515 1771.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# split string by | then by ; and trim tokens,
# then truncate each token to first three dot-separated levels
split_tokens = function(code) {
if(is.na(code) || code == "") return(character(0))
parts = stringr::str_split(code, "\\|", simplify = TRUE)
tokens = unlist(lapply(parts, function(p) stringr::str_split(p, ";", simplify = TRUE)))
tokens = unique(stringr::str_trim(tokens))
# For each token, extract first 3 dot levels
trunc3levels = function(token) {
levels = unlist(stringr::str_split(token, "\\."))
if(length(levels) > 3) {
levels = levels[1:3]
}
paste(levels, collapse = ".")
}
truncated_tokens = sapply(tokens, trunc3levels)
unique(truncated_tokens)
}
bin_set = split_tokens(athMercator)
v4_set = split_tokens(plantXMercator)
# Tokens that are common between sets truncated to 3 levels
common_tokens = intersect(bin_set, v4_set)
# Check if plantXMercator is exact duplication of athMercator token(s) (all plantXMercator tokens equal truncated bin_set token(s))
v4_parts = stringr::str_split(plantXMercator, "\\|", simplify = TRUE)
if(length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
return(paste0("100% match based on ", bin_set))
}
# Check if sets are identical
if(setequal(bin_set, v4_set)) {
return(paste0("100% match based on ", paste(bin_set, collapse = ", ")))
}
# Partial match if any tokens overlap, mention those tokens
if(length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, fruitTrees_BINCODE)) %>% # change name
dplyr::ungroup()## #### #### before filter #### ####
## [1] 21427
## [1] 19982
## [1] 1 58
## [1] 1 115
## [1] 132
## [1] 105
## #### #### #### ####
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
methods = c("MCScanX", 'RBH', "FastOMA")
}
match_categories = c("no match", "100% match based", "partial match")
long_dt = data.table::rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match no match partial match
## 44419 5632 3587
##
## 100% match no match partial match
## 1 25281 4776 3224
## 2 11051 631 252
## 3 8087 225 111
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "RBH", "FastOMA")
} else {
special_methods = c("RBH", "FastOMA")
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
# reconsider
# (grepl("match based on", mapman_val, ignore.case = TRUE) &&
# !grepl("^100% match based on 35\\.2$", mapman_val)) # for flags 3
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID))
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)]))
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
print("MapMan4 assignment counts per method:")## [1] "MapMan4 assignment counts per method:"
## RBH_MapMan4 FastOMA_MapMan4
## 4572 4786
## #### #### #### ####
##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 4786 5843 16398 4572 22039
## #### #### #### ####
df = dt
data.table::fwrite(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'),
sep = '\t')
openxlsx::write.xlsx(df,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'),
asTable = TRUE)rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "df$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "kept$from_count", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = "df$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = "kept$to_count", xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_kept = data.table::rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = c("no match", "100% match based", "partial match"),
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "100% match based")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "partial match"))
)
)]
}), use.names = TRUE)
long_kept[, Match_Type := factor(Match_Type, levels = c("no match", "partial match", "100% match based"))]
ggplot2::ggplot(long_kept, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (after filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub('based on.*', '', keptsub$MapMan4_Match)
table(keptsub$MapMan4_Match)##
## 100% match no match partial match
## 28419 1706 1474
##
## 100% match no match partial match
## 1 10809 900 1152
## 2 9523 581 211
## 3 8087 225 111
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = as.character(tab$MapMan4_Match)
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match ', '100% match '))
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Frequency of count_evidence by MapMan4_Match (after filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = factor(tab$MapMan4_Match, levels = c('no match', 'partial match', '100% match'))
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Frequency by MapMan4_Match (after filter)",
x = "KG Criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/y_', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/y_', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', 'RBH', "FastOMA")
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', 'RBH', "FastOMA")
} else {
source_cols = c("MCScanX", 'RBH', "FastOMA")
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
## [1] 18899
## [1] 19230
## [1] 1 46
## [1] 1 102
## [1] 7
## [1] 16
## #### #### #### ####
pss_long = pss_long[, grep("id$|all_pathways$|short_name$", colnames(pss_long))]
pss_long = pss_long[!duplicated(pss_long), ]
pss_long = merge(pss_long,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss_long = pss_long[grep('^AT', pss_long$id), ]
pss_long = pss_long[!duplicated(pss_long), ]
table(pss_long$filter_criteria)##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 200 179 674 146 789
openxlsx::write.xlsx(pss_long,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)# Step 1: params_list
# params_list <- list(
# ...
# )
#
# Step 2: YAML header in 09_fruitTrees-child.Rmd
# ---
# title: "fruitTrees Child"
# output: html_document
# params:
# plantName1: NULL
# plantName2: NULL
# plantName3: NULL
# plantName4: NULL
# plantDirIn: NULL
# plantNameOut: NULL
# plantDirOut: NULL
# pattern_in: NULL
# pattern_out: NULL
# compara_pattern_in1: NULL
# compara_pattern_in2: NULL
# plaza_pattern_in1: NULL
# plaza_pattern_in2: NULL
# ref_genome: NULL
# mercator: NULL
# mercatorPatternIn1: NULL
# mercatorPatternOut1: NULL
# mercatorPatternIn2: NULL
# mercatorPatternOut2: NULL
# ---
#
#
# access params in the script like:
# params$plantName1
# params$plantDirOut
#
# Step 3: Call render() like
# rmarkdown::render(
# input = "09_fruitTrees-child.Rmd",
# params = params_list,
# envir = new.env(parent = globalenv()), # optional: isolate execution
# output_format = "html_document",
# quiet = FALSE
# )
#
#
# This will:
# Inject params_list into params$...
# Knit the child Rmd in a separate process
# Print progress to the console (quiet = FALSE)
# Save an .html file to the working directory## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.utf8 LC_CTYPE=Slovenian_Slovenia.utf8
## [3] LC_MONETARY=Slovenian_Slovenia.utf8 LC_NUMERIC=C
## [5] LC_TIME=Slovenian_Slovenia.utf8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_4.0.0 knitr_1.50 data.table_1.17.8
## [5] magrittr_2.0.4
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3 dplyr_1.1.4
## [5] compiler_4.5.1 zip_2.3.3 Rcpp_1.1.0 tidyselect_1.2.1
## [9] stringr_1.5.2 jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10
## [13] fastmap_1.2.0 R6_2.6.1 labeling_0.4.3 generics_0.1.4
## [17] patchwork_1.3.2 igraph_2.1.4 openxlsx_4.2.8 tibble_3.3.0
## [21] bslib_0.9.0 pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.6
## [25] utf8_1.2.6 cachem_1.1.0 stringi_1.8.7 xfun_0.53
## [29] sass_0.4.10 S7_0.2.0 cli_3.6.5 withr_3.0.2
## [33] digest_0.6.37 grid_4.5.1 rstudioapi_0.17.1 lifecycle_1.0.4
## [37] vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0 farver_2.1.2
## [41] colorspace_2.1-1 rmarkdown_2.29 tools_4.5.1 pkgconfig_2.0.3
## [45] htmltools_0.5.8.1